#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
import pandas as pd
import numpy as np
# Set your working directory here.
# A raw string is used so that the Windows backslashes are taken literally
# instead of being parsed as (possibly invalid) escape sequences.
os.chdir(r"C:\Users\gmaze\Documents\GitHub\job_insecurity\data\other")


# In[ ]:


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
# Hyper-parameter grids for the grid searches. Each key has the form
# "<pipeline step name>__<parameter name>" and maps to the candidate values
# to try for that parameter.

# Grid for the pipeline whose classifier step is called 'clf'.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__max_df': (0.75, 1.0),
    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.01, 0.001),
}

# Grid for the pipeline whose classifier step is called 'clf-svm'.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__max_features': (1000, 1500),
    'vect__max_df': (0.75, 1.0),
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (0.01, 0.001),
    'clf-svm__penalty': ('l2', 'l1', 'elasticnet'),
    'clf-svm__loss': ('hinge', 'modified_huber'),
}

# Grid for the pipeline whose classifier step is called 'clf-ridge'.
parameters_ridge = {
    'clf-ridge__fit_intercept': (True, False),
}

# Stand-alone tf-idf transformer and count vectorizer (the latter limited to
# 1500 features).
tfidf_transformer = TfidfTransformer()
count_vect = CountVectorizer(max_features=1500)

# English Snowball stemmer used by the stemming vectorizer; the NLTK
# 'stopwords' resource is fetched before the stemmer is created with
# ignore_stopwords=True.
import nltk
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it emits.

    Stemming is done with the module-level ``stemmer`` object.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Run the regular analyzer first, then stem each resulting token.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer


stemmed_count_vect = StemmedCountVectorizer(stop_words='english', min_df=2)


# In[ ]:


# Three text-classification pipelines sharing the same front end:
# stemmed token counts -> tf-idf weighting -> classifier.

# Multinomial naive Bayes classifier (step name 'clf').
text_clf = Pipeline(steps=[
    ('vect', StemmedCountVectorizer(stop_words='english',
                                    min_df=2,
                                    max_features=1500)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(fit_prior=True)),
])

# SGD-trained linear classifier with hinge loss (step name 'clf-svm').
# The vectorizer here sets only stop_words; max_features and min_df are not
# passed.
text_clf_svm = Pipeline(steps=[
    ('vect', StemmedCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(random_state=42, loss='hinge')),
])

# Ridge classifier with built-in cross-validation (step name 'clf-ridge').
text_clf_ridge = Pipeline(steps=[
    ('vect', StemmedCountVectorizer(stop_words='english',
                                    min_df=2,
                                    max_features=1500)),
    ('tfidf', TfidfTransformer()),
    ('clf-ridge', RidgeClassifierCV()),
])


# In[2]:


# Load the classified abstracts/introductions and build the text to be
# analyzed: the abstract followed by the full text.

data = pd.read_excel("raw\\abstracts_intros_classified.xlsx")
data.columns = ['index', 'id', 'abstract', 'class', 'full_text', 'class2']
data = data.drop(['index', 'class2'], axis=1)

# Assign the filled column back instead of calling fillna(inplace=True) on
# data['full_text']: that form is chained assignment, which leaves `data`
# unchanged when pandas Copy-on-Write is active.
data['full_text'] = data['full_text'].fillna(' a')
data['text'] = data['abstract'] + " " + data['full_text']

# NOTE(review): a row whose full_text was filled with ' a' gets a text ending
# in "  a" (two spaces), so this filter only removes rows whose text is
# literally "a a" -- confirm that this is the intended placeholder check.
data = data[data.text != "a a"]

# data_original keeps every row that passed the filter, including the rows
# that dropna() removes from the training data below.
data_original = data
data = data.dropna()
data = data.sort_values(by=["class"])

# Training documents (list) and their labels (1-D array).
data_train = data["text"].tolist()
classes_train = data["class"].to_numpy()

# The same two views over all rows kept in data_original.
data_all = data_original["text"].tolist()
classes_all = data_original["class"].to_numpy()


# In[ ]:


# Stemmed text, SVM grid search

# Fit the SVM pipeline once with the settings it was constructed with.
text_clf_svm = text_clf_svm.fit(data_train, classes_train)

# Search the parameters_svm grid over the same training data; n_jobs=-1 asks
# for the candidate fits to be run in parallel.
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(data_train, classes_train)

# Print the best cross-validated score: a bare expression is only displayed
# inside a notebook cell and would be silently discarded when run as a script.
print(gs_clf_svm.best_score_)


# In[ ]:


# Predict a class for every document in data_all with the fitted grid search.
predictions = gs_clf_svm.predict(data_all)

# Export the ids and the predicted classes to two separate Excel files.
# NOTE(review): predictions_df is built without an index, while ids keeps the
# index of data_original; the two files therefore correspond row by row in
# position -- confirm that downstream code does not join them on the index.
predictions_df = pd.DataFrame(data=predictions)
ids = data_original[['id']]

ids.to_excel("raw\\ids.xlsx")
predictions_df.to_excel("raw\\predictions.xlsx")

